#HIDDEN
%matplotlib inline
from glob import glob # nbi:hide_in
from IPython.display import Image
from ipywidgets import Text, Dropdown, Output, interact
from matplotlib.collections import LineCollection
from matplotlib import pyplot as plt
from pandas import DataFrame, merge, read_csv, Series, to_datetime
from pickle import dump, load
from random import randint
import cufflinks as cf
import numpy as np
import qgrid
cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True)
cf.set_config_file(offline=True, world_readable=True)
# For metaG
metadataG = read_csv("metadata/GLBRC_MetaG_Metadata.tsv",sep='\t')
metadataG.set_index("nucleic_acid_name",inplace=True)
metadataG.drop(["source","sampling ID","sequencing_type","height_mean_cm","air_temp_c","rep","Sampling Time","reads_file_name","treatment"],axis=1,inplace=True)
for id in metadataG.index: metadataG.loc[id,"type"] = metadataG[metadataG.index == id].plot_name[0][0:2]
metadataG['Date'] = to_datetime(metadataG.sampling_date)
metadataG=metadataG.sort_values(by=["type","Date"])
metadataG.drop(["sampling_date"],axis=1,inplace=True)
metaG_Reads = read_csv("mapping/metaG/hostRemovalFlagstats/multiqc_data/multiqc_bowtie2.txt",sep="\t")
for id in metaG_Reads.index: metaG_Reads.at[id,"Sample"] = metaG_Reads.at[id,"Sample"].replace(".stat","")
metaG_Reads.set_index("Sample",inplace=True)
metadataG = merge(metadataG,metaG_Reads,left_index=True,right_index=True)
metadataG.sort_values("Date",inplace=True)
metadataG["aligned"] = (metadataG['paired_total']-metadataG['paired_aligned_mate_none_halved'])
metadataG["percPlantAligned"]= metadataG["overall_alignment_rate"]
metadataG["Plant Reads"] = metadataG["aligned"]
metadataG["Total"] = metadataG["paired_total"]
metadataG.drop(['aligned','total_reads','paired_total','paired_aligned_none','paired_total','paired_aligned_one', 'paired_aligned_multi',
'paired_aligned_discord_one', 'paired_aligned_mate_none', 'paired_aligned_mate_one', 'paired_aligned_mate_multi',
'overall_alignment_rate', 'paired_aligned_mate_multi_halved', 'paired_aligned_mate_none_halved', 'paired_aligned_mate_one_halved'],axis=1,inplace=True)
metaG_Reads = read_csv("mapping/metaG/fungalRemovalFlagstats/multiqc_data/multiqc_bowtie2.txt",sep="\t")
for id in metaG_Reads.index: metaG_Reads.at[id,"Sample"] = metaG_Reads.at[id,"Sample"].replace(".fungal.stat","")
metaG_Reads.set_index("Sample",inplace=True)
metadataG = merge(metadataG,metaG_Reads,left_index=True,right_index=True)
metadataG.sort_values("Date",inplace=True)
metadataG["Fungal Reads"] = (metadataG['paired_total']-metadataG['paired_aligned_mate_none_halved'])
metadataG["percFungalAligned"]= metadataG["overall_alignment_rate"]
metadataG['percFungalAligned'] = Series([float("{0:.2f}%".format(val).replace("%","")) for val in metadataG['percFungalAligned']], index = metadataG.index)
metadataG['percPlantAligned'] = Series([float("{0:.2f}%".format(val).replace("%","")) for val in metadataG['percPlantAligned']], index = metadataG.index)
metadataG['Remaining'] = metadataG['paired_aligned_mate_none_halved']
metadataG['percRemaining'] = (metadataG["Remaining"]/metadataG['Total'])*100
metadataG.drop(['total_reads','paired_total','paired_aligned_none','paired_total','paired_aligned_one', 'paired_aligned_multi',
'paired_aligned_discord_one', 'paired_aligned_mate_none', 'paired_aligned_mate_one', 'paired_aligned_mate_multi',
'overall_alignment_rate', 'paired_aligned_mate_multi_halved', 'paired_aligned_mate_none_halved', 'paired_aligned_mate_one_halved'],axis=1,inplace=True)
G5Data = metadataG[metadataG['type'] == 'G5']
G6Data = metadataG[metadataG['type'] == 'G6']
# qgrid_widgetMG = qgrid.show_grid(metadataG)
# qgrid_widgetMG
#HIDDEN
metadataT = read_csv("metadata/GLBRC_MetaT_Metadata.tsv",sep='\t')
metadataT.set_index("nucleic_acid_name",inplace=True)
metadataT.drop(["name", "Year","plotID","month","sampleID","day", "year", "time", "weather", "air_temp_c", "notes", "rep", "time_zone", "longitude", \
"latitude", "altitude", "country", "soil_name", "number_cores", "SPNL_date", "pH", "lime_index", "P_ppm", "K_ppm", \
"Ca_ppm", "Mg_ppm", "organic_matter", "NO3N_ppm", "NH4_ppm", "soil_moisture_percent", "soil_temp_10cm", "LDMC_mg_per_g", \
"nitrogen_percent", "carbon_percent", "carbon_per_nitrogen", "height_mean_cm", "mass_per_leaf_g", "date_of_extraction",\
"nucleic_acid_type", "replicate_extraction", "source", "source_mass", "extraction_method", "elution_vol_ul", "concentration_ng_per_ul",\
"ratio_260_280", "conc_ng_per_g_source", "extracted_by", "sequence_name", "sequencing_date", "conc_sent_ng_per_ul", \
"sequencing_type", "sequencing_facility", "primers", "submitted_for_sequencing", "sequencing_successful", "duplicate_submitted",\
"dup_sequencing_name", "exclude_from_analysis", "itemID_JGI", "sampleID_JGI", "barcode", "JGI_library", "HPCC_path", "JGI_taxonOID",\
"JGI_rawdataname", "time_numeric", "date", "precipitation", "Air_temp_mean", "air_temp_max", "Air_Temp_Min",\
"Air_Pressure","plant_name", "RH", "AH", "Wind_Speed_Mean", "Wind_Direction_Mean", "Solar_Radiation", "PAR", "soil_temp_5_cm_bare_avg", \
"soil_temp_5_cm_sod_avg"],axis=1,inplace=True)
for id in metadataT.index: metadataT.loc[id,"type"] = metadataT[metadataT.index == id].plot_name[0][0:2]
metadataT['Date'] = to_datetime(metadataT.sampling_date)
metadataT = metadataT.sort_values(by=["type","Date","treatment","plot_name"])
metadataT.head()
#Reads After host removal
metaT_Reads = read_csv("mapping/metaT/hostRemovalFlagstats/multiqc_data/multiqc_bowtie2.txt",sep="\t")
for id in metaT_Reads.index: metaT_Reads.at[id,"Sample"] = metaT_Reads.at[id,"Sample"].replace(".stat","")
metaT_Reads.set_index("Sample",inplace=True)
metaT_Reads["percPlantAligned"] = ((metaT_Reads['total_reads'] - metaT_Reads['paired_aligned_none']) / metaT_Reads['total_reads'])*100.0
metadataT = merge(metadataT,metaT_Reads,left_index=True,right_index=True)
metadataT.sort_values("Date",inplace=True)
# metadataT.head()
# qgrid_widgetMT = qgrid.show_grid(metadataG)
# qgrid_widgetMT
Step 1. Split JGI files in PE files
Step 1. Align reads to respective host reference assembly
#HIDDEN
# %matplotlib widget
from IPython.core.display import HTML
# disp = {"Percentage":{"Switchgrass & Miscanthus": "html/BothPerc.html", "Switchgrass": "html/SwitchPerc.html", "Miscanthus": "html/MiscanPerc.html"}}
# @interact
# def graph(Source=["MetaG","MetaT"],Host_Plant=["Switchgrass & Miscanthus","Switchgrass","Miscanthus"],By=["Percentage","Count"]): #return HTML(disp[By][Host_Plant])
# if Source == "MetaG": metadata = metadataG
# else: metadata = metaT_metadata
# G5Data = metadata[metadata['type'] == 'G5']
# G6Data = metadata[metadata['type'] == 'G6']
# # htmlName = ""; fig = None
# if By == "Percentage":
# if (Host_Plant == "Switchgrass & Miscanthus"):
# # htmlName = "html/BothPerc.html"
# return metadata.groupby(['Date','plot_name']).sum()['percPlantAligned'].unstack().iplot(asFigure=True,title="% Reads Aligning to Respective Plant Host Assembly G5=Switchgrass G6=Miscanthus");
# elif Host_Plant == "Switchgrass":
# # htmlName = "html/SwitchPerc.html"
# return G5Data.groupby(['Date','plot_name']).sum()['percPlantAligned'].unstack().iplot(asFigure=True,title="% Reads Aligning to Switchgrass Assembly",fontsize=12);
# else: # "Miscanthus":
# # htmlName = "html/MiscanPerc.html"
# return G6Data.groupby(['Date','plot_name']).sum()['percPlantAligned'].unstack().iplot(asFigure=True,title="% Reads Aligning to Micanthus Assembly",fontsize=12);
# # ylim = 100; ylab = "% Reads"
# else:
# colors = ["#74C476", "#f26d07","#794955"]
# if (Host_Plant == "Switchgrass & Miscanthus"):
# # htmlName = "html/BothCount.html"
# return metadata.loc[:,['Plant Reads','Fungal Reads', 'Remaining']].iplot(asFigure=True,kind='bar', barmode = 'stack', color=colors, title="Reads Counts By Source")
# elif Host_Plant == "Switchgrass":
# # htmlName = "html/SwitchCount.html"
# return G5Data.loc[:,['Plant Reads','Fungal Reads', 'Remaining']].iplot(asFigure=True,kind='bar', barmode = 'stack', color=colors, title="Switchgrass Reads Counts")
# else:
# # htmlName = "html/MiscanCount.html"
# return G5Data.loc[:,['Plant Reads','Fungal Reads', 'Remaining']].iplot(asFigure=True,kind='bar', barmode = 'stack', color=colors, title="Miscanthus Reads Counts By Source")
# # py.offline.plot(fig,filename=htmlName)
# # return ax #HTML(htmlName)
HTML(filename="html/BothPerc.html")
Step 1. Extract reads that don't align the plant assembly
#HIDDEN
HTML(filename='html/FungalAlign.html')
#HIDDEN
# #HIDDEN
# import plotly.plotly as py
# fig = metadataG.groupby(['Date','plot_name']).sum()['percFungalAligned'].unstack().iplot(yTitle="Percentage of Reads",title="All Samples Perctages of Reads Aligning to Combined Fungal Assemblies",asFigure=True);
# py.offline.plot(fig,filename="html/FungalAlign.html")
# @interact
# def fungalAlign(Samples=["Switchgrass", "Miscanthus", "Switchgrass & Miscanthus"]):
# metadataG.groupby(['Date','plot_name']).sum()['percFungalAligned'].unstack().iplot(yTitle="Percentage of Reads",title="All Samples Perctages of Reads Aligning to Combined Fungal Assemblies")
# if Samples == "Switchgrass": return G5Data.groupby(['Date','plot_name']).sum()['percFungalAligned'].unstack().iplot(yTitle="Percentage of Reads",title="Switchgrass Samples Perctages of Reads Aligning to Combined Fungal Assemblies");
# elif Samples == "Miscanthus": return G6Data.groupby(['Date','plot_name']).sum()['percFungalAligned'].unstack().iplot(yTitle="Percentage of Reads",title="Miscanthus Samples Perctages of Reads Aligning to Combined Fungal Assemblies");
# else: return metadataG.groupby(['Date','plot_name']).sum()['percFungalAligned'].unstack().iplot(yTitle="Percentage of Reads",title="All Samples Perctages of Reads Aligning to Combined Fungal Assemblies");
The annotations were performed using KEGG's prokaryotic peptides and eukaryotic peptides

Based on the high-overlap, we decided to use the mags instead of the assembly for alignment
checkm lineage_wf -t 20 -x fa Final.contigs.fa.metabat-bins40 metaBinsStats
#HIDDEN
df = read_csv("mags/MAG_Stats.tsv",sep="\t")
df = df.head(100)
df.to_html("html/MAG_Stats.html")
# qgrid_widgetMAGs = qgrid.show_grid(df)
# qgrid_widgetMAGs
HTML(filename="html/MAG_Stats.html")
All bins with at least 50% completeness were used in a combined assembly. The analysis below comes from the counts for each sample. These counts were normalized by the number of reads that did not align to the fungal or host assemblies.

|
|
|
||||||||||||||||||||||||||||||||||||
|
|
|
||||||||||||||||||||||||||||||||||||
Call: adonis(formula = dist.otu ~ map_16S\$time_numeric)
Permutation: free Number of permutations: 999
Terms added sequentially (first to last)
| Df | SumsOfSqs | MeanSqs | F.Model | R2 | Pr(>F) | |
|---|---|---|---|---|---|---|
| map_16S$time_numeric | 1 | 1.9356 | 1.93562 | 47.814 | 0.26298 | 0.001 *** |
| Residuals | 134 | 5.4246 | 0.04048 | 0.73702 | ||
| Total | 135 | 7.3603 | 1.000000 |


Call: adonis(formula = dist.otu ~ map_16S\$treatment)
Permutation: free Number of permutations: 999
Terms added sequentially (first to last)
| Df | SumsOfSqs | MeanSqs | F.Model | R2 | Pr(>F) | |
|---|---|---|---|---|---|---|
| map_16S$treatment | 1 | 0.0289 | 0.028930 | 0.52878 | 0.00393 | 0.755 |
| Residuals | 134 | 7.3313 | 0.054711 | 0.99607 | ||
| Total | 135 | 7.3603 | 1.000000 |
Call: adonis(formula = dist.otu ~ map_16S\$plant)
Permutation: free Number of permutations: 999
Terms added sequentially (first to last)
| Df | SumsOfSqs | MeanSqs | F.Model | R2 | Pr(>F) | |
|---|---|---|---|---|---|---|
| map_16S$plant | 1 | 0.1617 | 0.161704 | 3.0101 | 0.02197 | 0.027 * |
| Residuals | 134 | 7.1986 | 0.053721 | 0.97803 | ||
| Total | 135 | 7.3603 | 1.000000 |

In order for this to progress, 1 had to be complete. The next step is figure out why the metadata for the MetaT is not meshing with the R script.
Kallisto has an arguement to export a bam file when it runs, but I do not know how these bam files work or if they are the same thing as a bam from bowtie2. I am working with kallisto to get a bam file now and then I will see if that file works to extract annotations from the contigs